In [1]:
from setup_notebooks import *
%matplotlib inline
In [2]:
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 200)
In [29]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
from collections import OrderedDict
Load previously cleaned data
In [3]:
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
df.tokens
Out[3]:
In [9]:
d = Dictionary.from_documents([str(s) for s in row] for row in df.tokens)
In [4]:
df.tokens.iloc[0]
Out[4]:
In [ ]:
# one way to fix this: the CSV stored each token list as its string repr, so eval() parses it back into a Python list
df.tokens = df.tokens.apply(eval)
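If the tokens column really does hold string reprs of Python lists, ast.literal_eval is a safer parser than eval, since it only accepts literals and can't execute arbitrary code. A minimal sketch:
In [ ]:
import ast
# parse "['hi', 'there']" back into ['hi', 'there'] without executing anything
df['tokens'] = df.tokens.apply(ast.literal_eval)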
In [13]:
df['tokens'] = df.txt.str.split()
df.tokens
Out[13]:
In [16]:
df.tokens.values[0:3]
Out[16]:
In [17]:
d = Dictionary.from_documents(df.tokens)
d
Out[17]:
In [18]:
tfidf = TfidfModel(d)
Hint-Hint: gensim is sprinting this week at PyCon!
In [19]:
TfidfModel?
In [20]:
TfidfModel(df.txt)
In [21]:
TfidfModel(df.tokens)
In [10]:
TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))
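One caveat with that generator expression: it can only be consumed once. If the bag-of-words corpus will be reused, materialize it as a list or stream it from disk. A sketch (the 'bow_corpus.mm' filename is just an assumption):
In [ ]:
from gensim.corpora import MmCorpus
bow_corpus = [d.doc2bow(tokens) for tokens in df.tokens]           # a list survives multiple passes
MmCorpus.serialize(os.path.join(DATA_PATH, 'bow_corpus.mm'), bow_corpus)
bow_corpus = MmCorpus(os.path.join(DATA_PATH, 'bow_corpus.mm'))    # lazily streamed back from disk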
But there's a simpler way.
We already have a vocabulary, with document frequencies for every term, in our Dictionary...
In [15]:
pd.Series(d.dfs)
Out[15]:
In [16]:
pd.Series(d.iteritems())
Out[16]:
OK, now I get it:

- document: a list of strings (an ordered sequence of tokens)
- bow, or "bag of words": a list of Counter-like mappings between word IDs and their count in each document
- TfidfModel: a transformation from a BOW into a BORF, a "bag of relative frequencies" (TFIDF = BORF = term frequencies normalized by document occurrence counts)
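Here's a tiny self-contained sketch of those three ideas on a toy corpus (not the data loaded above), just to make the BOW-to-TFIDF hand-off concrete:
In [ ]:
toy_docs = [['python', 'is', 'fun'], ['python', 'is', 'popular'], ['cats', 'are', 'fun']]
toy_d = Dictionary(toy_docs)                          # vocabulary: token -> integer id
toy_bows = [toy_d.doc2bow(doc) for doc in toy_docs]   # each doc becomes [(word_id, count), ...]
toy_tfidf = TfidfModel(toy_bows)                      # learns document frequencies from the bows
toy_tfidf[toy_bows[0]]                                # -> [(word_id, tfidf weight), ...]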
In [21]:
pd.Series(d.doc2bow(toks) for toks in df.tokens[:6])
Out[21]:
Did it assign 0 to the first word it found?
Sort-of...
In [22]:
d.token2id['python']
Out[22]:
In [23]:
d.token2id['Python']
Out[23]:
In [24]:
d.token2id['you']
Out[24]:
In [26]:
d[1] # guesses anyone?
Out[26]:
In [ ]:
In [27]:
tfidf = TfidfModel(dictionary=d)
tfidf
Out[27]:
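Now the fitted model can transform any bag of words into TFIDF weights, e.g. for the first document (a quick sketch):
In [ ]:
bow = d.doc2bow(df.tokens.iloc[0])
tfidf[bow][:10]   # [(word_id, tfidf weight), ...] for the first document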
In [30]:
dfs = pd.Series(OrderedDict(sorted([(d.id2token[i], numdocs) for (i, numdocs) in tfidf.dfs.items()])))
dfs
Out[30]:
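Sorting that Series shows which terms appear in the most documents (a quick check, nothing more):
In [ ]:
dfs.sort_values(ascending=False).head(10)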
In [27]:
dfs.iloc[4000:4030]
Out[27]:
In [28]:
tfidf.num_docs
Out[28]:
In [29]:
tfidf.num_nnz
Out[29]:
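num_nnz is the total number of non-zero (document, term) entries in the BOW matrix, i.e. the sum over documents of the number of distinct tokens in each. A sanity check, assuming d was built from this same df.tokens:
In [ ]:
sum(len(d.doc2bow(toks)) for toks in df.tokens)   # should equal tfidf.num_nnz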
In [30]:
tfidf.save(os.path.join(DATA_PATH, 'tfidf'))
In [31]:
tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
In [32]:
tfidf2.num_nnz
Out[32]:
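A quick round-trip check (sketch): the reloaded model should weight documents identically to the original.
In [ ]:
bow = d.doc2bow(df.tokens.iloc[0])
assert tfidf[bow] == tfidf2[bow]   # same weights before and after save/load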
In [ ]: